Data import

## Ambulance Data related to acute cerebrovascular accidents
brain_data <- read_csv("data/raw/brain_data.csv") %>%
   mutate(across(c(Year, Season, Month, DayOff, COVID, Sudden_Start, Dst_level), as.factor))
## Ambulance Long Data related to acute cerebrovascular accidents
brain_data_long <- read_csv("data/raw/brain_data_long.csv") %>%
    mutate(across(c(Year, Season, Month, DayOff, COVID, Sudden_Start, Dst_level, Sex, Age), as.factor))

Correlation of quantitative variables

Our dataset contains a large number of quantitative variables:

# Data preprocessing
brain_data_clear <- brain_data %>% 
  select( where(is.numeric) & !ends_with(c("min", "max", "var", "K_Sum")) ) %>%
  filter(complete.cases(.))

# Function for displaying correlations with a neutral color for values close to 0
color_correlation_fixed <- function(data, mapping, ...) {
  
  x <- eval_data_col(data, mapping$x)
  y <- eval_data_col(data, mapping$y)
  cor_value <- cor(x, y, use = "complete.obs")
  
  # Defining color
  color <- if (abs(cor_value) < 0.1) {
    scales::alpha("gray50", 0.5)           ## Gray for weak correlations
  } else if (cor_value > 0) {
    scales::alpha("blue", abs(cor_value))  ## Blue for positive correlations
  } else {
    scales::alpha("red", abs(cor_value))   ## Red for negative correlations
  }
  
  # Filter for small correlation values
  cor_label <- ifelse(abs(cor_value) < 0.01, "0", sprintf("%.2f", cor_value))
  
  ggplot(data.frame()) + 
    annotate("text", x = 0.5, y = 0.5, label = cor_label, size = 10, color = color ) +
    theme_void() 
}

# Function for trend line with a golden color
golden_smooth <- function(data, mapping, ...) {
  ggplot(data, mapping) +
    geom_point(alpha = 0.4, size = 0.5, color = "black") + 
    geom_smooth(color = "gold", ...) + 
    theme_custom
}

# Plot construction
ggpairs(
  brain_data_clear, 
  progress = FALSE,
  upper = list(continuous = color_correlation_fixed), ## Highlighting correlations
  lower = list(continuous = golden_smooth) )+         ## Golden trend line
theme_custom +
  theme(
    axis.text.x = element_text(size = 10),  
    axis.text.y = element_text(size = 10) )   

When analyzing the network graph, three groups of features can be identified: - Climatic: temperature, pressure, wind, precipitation; - Geomagnetic: Dst index, Ap index; - Solar: F10.7 index, number of sunspots.

cor_data <- brain_data %>% 
  select( where(is.numeric) & !ends_with(c("min", "max", "Sum", "var")) ) %>%
  rename(Temperature = Temp_mean, Precipitation = H2O, 
         `F10.7 index` = F10.7, `Ap index` = Ap_mean, `Dst index` = Dst_mean) %>% 
  cor(use = "pairwise.complete.obs") 
  
corrplot(cor_data, method = 'number', type = 'lower', diag = FALSE)

network_plot(cor_data, min_cor = .1)

Additionally, some features within the groups exhibit a high degree of correlation! - A negative correlation was expected between the daily average temperature and pressure (r = -0.73). - The F10.7 index and the number of sunspots show a high correlation (r = 0.92), as both reflect solar activity but are measured using different methods. - The daily average Ap and Dst indices demonstrate a significant correlation (r = -0.70), as both reflect changes in the Earth’s geomagnetic field. However, the distribution of data along the trend line remains uneven.

brain_data %>% 
  drop_na() %>% 
  ggplot(aes(Temp_mean, Pressure))+
  geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
  geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
  stat_cor(method = "pearson", label.x.npc = 0.6, label.y.npc = 0.9, size = 7)+
  labs(x = "Temperature")+
  theme_custom

brain_data %>% 
  ggplot(aes(Sunspots, F10.7))+
  geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
  geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
  stat_cor(method = "pearson", label.x.npc = 0, label.y.npc = 0.9, size = 7)+
  labs(x = "Sunspot Number", y = "F10.7 index")+
  theme_custom

brain_data %>% 
  ggplot(aes(Dst_mean, Ap_mean))+
  geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
  geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
  stat_cor(method = "pearson", label.x.npc = 0.6, label.y.npc = 0.9, size = 7)+
  labs(x = "Daily Average Dst index", y = "Daily Average Ap index")+
  theme_custom

Based on further analysis, the following variables will be excluded: - Pressure: does not provide full coverage of the data throughout the study period and exhibits less variability compared to Temperature. - F10.7 Index: has an incomplete data set and exhibits less variability compared to Sunspot count. - The Ap and Dst geomagnetic activity indices will not be excluded.

However, all the variables considered do not have a noticeable relationship with the number of ambulance calls:

  brain_data %>% 
  rename(Temperature = Temp_mean, `Ap index` = Ap_mean, `Dst index` = Dst_mean) %>% 
  pivot_longer(cols = c(Temperature, Sunspots, `Ap index`, `Dst index`)) %>% 
  ggplot(aes(x = value, y = EMS))+
  geom_point()+
  geom_smooth(se = FALSE, method = "gam", colour = "goldenrod2", linewidth = 2)+
  theme_custom +
  labs(x = "", y = "Daily Emergency Calls") +  
  facet_wrap(~name, scales = "free")

Data Decomposition

The number of sunspots is directly linked to the solar activity cycles, as confirmed by the graph:

brain_data %>% 
  select(!c(Pressure, F10.7, Ap_mean)) %>% 
  ggplot(aes(Date, Sunspots))+
  geom_line(alpha = 0.3, colour = "royalblue2")+
  geom_smooth(method = "gam", formula = y ~ s(x, k = 10),  se = FALSE,
              colour = "orangered4", linewidth = 2) +
  geom_vline(xintercept = as.Date("2009-01-01"), 
             colour = "orangered", linewidth = 1, linetype = "dashed")+
  geom_vline(xintercept = as.Date("2019-12-31"), 
             colour = "orangered", linewidth = 1, linetype = "dashed")+
  scale_x_date(date_breaks = "2 year", date_labels = "%Y")+
  labs(x = "", y = "Sunspot number")+
  theme_custom

Our data covers the complete 24th solar cycle, spanning the period from December 2008 to December 2019.

Working with time series allows us to decompose the data to identify trends and cyclic patterns:

ggarrange(

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(Sunspots) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("Sunspots"), 

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(EMS) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("EMS"), 

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(Temp_mean) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("Temperature"), 

  nrow = 1
)

ggarrange(

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(Sunspots) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("Sunspots"), 

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(Ap_mean) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("Ap Index"), 

  brain_data %>% filter(Date >=  ymd("2007-01-01")) %>% pull(Dst_var) %>% 
  ts(start = 2007, frequency = 365) %>% stl(s.window = "periodic") %>% 
  autoplot() + theme_custom + ggtitle("Dst amplitude"), 
  
  nrow = 1
)

- After decomposing the emergency calls data, a linear increase over time is observed. However, since 2020, a noticeable decrease is apparent, likely associated with the COVID-19 pandemic. - Climatic data (especially temperature), after decomposition, show a clear cyclic pattern. - The daily Dst index does not show any clear trends or cycles. However, the daily fluctuation amplitude of the Dst index, after decomposition, exhibits a trend similar to the dynamics of sunspot numbers. Nevertheless, since noise is the dominant component, the results should not be taken too seriously.

Visualizing Association

Dst ~ Sunspots

brain_data %>%
  filter(Dst_level != "Severe") %>%
  mutate(Dst_level = fct_relevel(Dst_level, c("Quiet", "Weak", "Moderate", "Major", "Severe"))) %>% 
  ggplot(aes(Dst_level, Sunspots))+
  geom_jitter(alpha = 0.5, size = 1, width = 0.2) +  
  geom_boxplot(alpha = 0.5, outliers = FALSE, fill = "goldenrod2", linewidth = 1)+
  labs( x = "Geomagnetic activity level",  y = "Sunspot Number" ) +
  theme_custom

DayOff ~ EMS

mean_values <- brain_data %>%
  group_by(DayOff) %>%
  summarise(mean_EMS = round(mean(EMS, na.rm = TRUE), 2), .groups = "drop")

ggplot(brain_data, aes(x = DayOff, y = EMS)) +
  geom_boxplot(aes(fill = DayOff)) +
  geom_text(data = mean_values, 
            aes(x = DayOff, y = mean_EMS, label = mean_EMS), 
            color = "black", size = 8, fontface = "bold", vjust = -1) +
  scale_fill_brewer(palette = "Dark2", direction = -1)+
  scale_y_continuous(limits = c(NA, max(brain_data$EMS) + 3))+
  labs(y = "Daily Emergency Calls", x = "Non-working Days")+
  theme_custom +
  theme(legend.position = "none")+
  stat_pvalue_manual(t_test(data = brain_data,  EMS ~ DayOff),
                     label = "T-test, p = {p}", 
                     size = 10, 
                     y.position = max(brain_data$EMS) + 0.1)

- The difference between the average EMS values for weekdays and weekends is statistically significant. - The average EMS value is higher on weekdays (13.33) compared to weekends (11.02).

Season ~ EMS

brain_data %>% 
mutate(Season = fct_relevel(Season, c("Winter", "Spring", "Summer", "Autumn"))) %>% 
ggplot(aes(x = Season, y = EMS, fill = Season)) +
  geom_boxplot(alpha = 0.7, show.legend = FALSE) +
  scale_fill_manual(
    values = c("Winter" = "dodgerblue2", "Spring" = "palegreen2", "Summer" = "tan2","Autumn" = "tomato2")) +
  labs(y = "Daily Emergency Calls", x = "")+
  theme_custom 

brain_data %>%
  pairwise_t_test(EMS ~ Season, p.adjust.method = "holm") %>% 
  transmute(`Group 1` = group1, `Group 2` = group2, 
            `p-value` = p, `p.adjusted` = p.adj) %>%
  flextable() 

Group 1

Group 2

p-value

p.adjusted

Autumn

Spring

0.0162

0.0969

Autumn

Summer

0.1300

0.5200

Spring

Summer

0.3670

1.0000

Autumn

Winter

0.0259

0.1290

Spring

Winter

0.8650

1.0000

Summer

Winter

0.4670

1.0000

The distribution of emergency medical service calls across seasons is statistically insignificant (paired t-test with Holm p-value adjustment method: p.adjusted > 0.05).

Geomagnetic indices

brain_data %>% 
  drop_na(Kp_Sum, K_Sum) %>% 
  ggplot(aes(Kp_Sum, K_Sum))+
  geom_jitter(alpha = 0.5, colour = "royalblue2", size = 1.5)+
  geom_smooth(method = "lm", se = FALSE, colour = "orangered2", linewidth = 1.2) +
  stat_cor(method = "pearson", label.x.npc = 0, label.y.npc = 0.9, size = 7)+
  labs(x = "Daily Average Kp index", y = "Daily Average K index (IRT)")+
  theme_custom

brain_data %>%
  drop_na(Kp_Sum, K_Sum) %>% 
  pivot_longer(cols = c(Kp_Sum, K_Sum), names_to = "index") %>% 
  ggplot() +
  geom_histogram(aes(x = value, fill = index), 
                 binwidth = 5, colour = "black", alpha = 0.5, position = "identity", show.legend = FALSE) +
  scale_fill_manual(
    values = c("K_Sum" = "darkseagreen2", "Kp_Sum" = "steelblue2") )+
  scale_x_continuous(breaks = seq(0, 50, 10))+
  labs( x = "Index Values", y = "Observation Count", fill = "Index Type") +
  theme_custom +
  facet_wrap(~index, labeller = labeller(index = c("K_Sum" = "Daily Average K index (IRT)",
                                                   "Kp_Sum" = "Daily Average Kp index")))

Daily Average Kp index and Daily Average K index (IRT) show a strong correlation (r = 0.91). However, K index (IRT) has a smaller range and a more uniform distribution compared to Kp index.

Age & Sex ~ EMS

ggarrange(

brain_data_long %>%
  filter(between(as.numeric(as.character(Year)), 2007, 2021), Age != "55+") %>% 
  group_by(Year, Sex, Age) %>%
  summarise(EMS_total = sum(EMS, na.rm = TRUE), .groups = "drop") %>%
  mutate(EMS_plot = ifelse(Sex == "Male", -EMS_total, EMS_total)) %>% 
  ggplot(aes(x = Year, y = EMS_plot, fill = Sex)) +
  geom_bar(stat = "identity",  alpha = 0.7, colour = "black", show.legend = FALSE) +
  scale_y_continuous(labels = abs, n.breaks = 10) + 
  scale_fill_manual(
    values = c("Male" = "steelblue", "Female" = "pink"),
    labels = c("Male" = "Male", "Female" = "Female") ) +
  coord_flip() +
  labs(
    title = "Ambulance Calls Distribution by Year, Age and Gender",
    x = "", y = "", fill = "Gender") +
  theme_custom + 
  facet_wrap(. ~ Age,  scales = "free_x", 
              labeller = labeller(Age = c("25-44" = "Age: 25-44 years", 
                                          "45-54" = "Age: 45-54 years"))), 

brain_data_long %>%
  filter(between(as.numeric(as.character(Year)), 2007, 2021), Age == "55+") %>% 
  group_by(Year, Sex, Age) %>%
  summarise(EMS_total = sum(EMS, na.rm = TRUE), .groups = "drop") %>%
  mutate(EMS_plot = ifelse(Sex == "Male", -EMS_total, EMS_total)) %>% 
  ggplot(aes(x = Year, y = EMS_plot, fill = Sex)) +
  geom_bar(stat = "identity",  alpha = 0.7, colour = "black", ) +
  scale_y_continuous(labels = abs, n.breaks = 10) + 
  scale_fill_manual(
    values = c("Male" = "steelblue", "Female" = "pink"),
    labels = c("Male" = "Male", "Female" = "Female") ) +
  coord_flip() +
  labs(
    x = "", y = "Total EMS", fill = "Gender") +
  theme_custom + theme(legend.position = "inside", legend.justification = c(0.9, 0.2))+
  facet_wrap(. ~ Age,  scales = "free_x", 
              labeller = labeller(Age = c("55+" = "Age: 55+ years"))), 

nrow = 2

)